In [1]:
from anndata import AnnData
import scrublet as scr
import pandas as pd
import scanpy as sc
import numpy as np
import rbcde
import scipy
import os

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.figdir = './figures-N1-endometrium-integration/'  # all sc.pl save= output lands here
sc.logging.print_versions()  # record package versions for reproducibility
/usr/local/lib/python3.6/dist-packages/dask/config.py:161: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
  data = yaml.load(f.read()) or {}
scanpy==1.4.5.post2 anndata==0.6.22.post1 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==0.25.3 scikit-learn==0.22.1 statsmodels==0.11.0rc2 python-igraph==0.7.1 louvain==0.6.1
In [2]:
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

Import existing objects and rebuild raw form. There are some holes in their metadata, fill them in. Also, fix up some stuff that was flawed in the original metadata CSVs.

In [3]:
#rebuild each object from its .raw slot so we start from the full, unfiltered gene space
#NOTE(review): the N1-cbtm folder holds a file named endometrium-N2-10x.h5ad -
#presumably an upstream naming quirk; confirm this really is the CBTM object
bdata1 = sc.read('../N1-cbtm/endometrium-N2-10x.h5ad')
bdata1 = AnnData(bdata1.raw.X, obs=bdata1.obs, var=bdata1.raw.var)
bdata2 = sc.read('../N2-biopsies/endometrium-N2-10x.h5ad')
bdata2 = AnnData(bdata2.raw.X, obs=bdata2.obs, var=bdata2.raw.var)

#the CBTM metadata lacks clinical/day annotation - mark as unknown so the columns
#line up with the biopsy object on concatenation
bdata1.obs['clinical'] = 'U'
bdata1.obs['day'] = 'U'
bdata1.obs['type'] = 'CBTM'
#this one sample has a T, the rest are C
bdata1.obs['treatment'] = ['T' if '4861STDY7771123' in i else 'C' for i in bdata1.obs_names]
#redo all EN to ENMY for bdata1
#(cast to plain str first - the column is presumably categorical, and the cast
#sidesteps assigning a label the categories do not yet contain)
bdata1.obs['location'] = [str(i) for i in bdata1.obs['location']]
bdata1.obs.loc[bdata1.obs['location']=='EN', 'location'] = 'ENMY'
#biopsies use a more collapsed form of this
bdata1.obs['phase'] = ['P' if i=='proliferative' else 'S' for i in bdata1.obs['phase']]
#oh yeah and a few individuals were apparently misannotated in this category in the metadata
bdata1.obs.loc[[i in ['A13','A16'] for i in bdata1.obs['individual']], 'phase'] = 'P'
bdata2.obs['location'] = 'EN'

Merge the two objects, and store the raw form in .raw. Filter out unexpressed genes, recompute mitochondrial percentage, then obtain log(CPM/100 + 1) form of data and store that in .X.

In [4]:
#merge the two objects; index_unique=None keeps original cell barcodes rather than
#appending a per-batch suffix
adata = bdata1.concatenate(bdata2, index_unique=None)
#stash the merged matrix (all genes, pre-normalization) before any processing
adata.raw = adata.copy()

sc.pp.filter_genes(adata, min_cells=3)
#convert to lower to be species agnostic: human mito start with MT-, mouse with mt-
mito_genes = [name for name in adata.var_names if name.lower().startswith('mt-')]
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
#scale each cell to 10,000 counts, then log1p - i.e. log(CPM/100 + 1)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
filtered out 7109 genes that are detected in less than 3 cells
normalizing by total count per cell
    finished (0:00:14): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)

The rest of the analysis will be analogous to what was done for the CBTM/biopsy separately.

Remove cell cycle genes by inverting the object and clustering the gene space, reporting back the cluster with known cycling genes as members.

In [5]:
#work on a throwaway copy - this object only exists to find the cell cycle gene cluster
bdata = adata.copy()
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bdata = bdata[:, bdata.var['highly_variable']]
#transpose the object: genes become the observations, so downstream clustering
#groups co-expressed genes (.copy() first to materialize the HVG view)
bdata = bdata.copy().T
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack')
sc.pl.pca_variance_ratio(bdata, log=True, save='_ccg_identification.pdf')
extracting highly variable genes
    finished (0:00:08)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
computing PCA with n_comps = 50
    finished (0:00:07)
WARNING: saving figure to file figures-N1-endometrium-integration/pca_variance_ratio_ccg_identification.pdf
In [6]:
num_pcs = 20

#known cycling genes - used both for highlighting on the UMAP and for reporting
#which gene cluster they ended up in (single source of truth instead of two copies)
cycling_genes = ['CDK1','MKI67','CCNB2','PCNA']

sc.pp.neighbors(bdata,n_pcs=num_pcs)
sc.tl.umap(bdata)
sc.tl.leiden(bdata, resolution=0.4)
bdata.obs['known_cyclers'] = [i in cycling_genes for i in bdata.obs_names]
sc.pl.umap(bdata,color=['leiden','known_cyclers'],color_map='OrRd',save='_ccg_identification.pdf')

#.reindex() tolerates genes absent from the object (e.g. PCNA, which did not make
#the HVG cut and shows up as NaN) - unlike .loc with missing labels, which is
#deprecated and will raise a KeyError in future pandas versions
print(bdata.obs['leiden'].reindex(cycling_genes))
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:03)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:07)
running Leiden clustering
    finished: found 11 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
... storing 'clinical' as categorical
... storing 'day' as categorical
... storing 'individual' as categorical
... storing 'leiden' as categorical
... storing 'location' as categorical
... storing 'phase' as categorical
... storing 'sample' as categorical
... storing 'treatment' as categorical
... storing 'type' as categorical
WARNING: saving figure to file figures-N1-endometrium-integration/umap_ccg_identification.pdf
CDK1      10
MKI67     10
CCNB2     10
PCNA     NaN
Name: leiden, dtype: category
Categories (11, object): [0, 1, 2, 3, ..., 7, 8, 9, 10]
/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py:961: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]

With the cycling genes identified, clean up the plotting folder by moving all related plots to a subfolder.

In [7]:
def MovePlots(plotpattern, subplotdir):
    """Move every plot in sc.settings.figdir whose filename contains
    plotpattern into the subplotdir subfolder (created if needed)."""
    #local imports keep this notebook cell self-contained
    import glob
    import shutil
    figdir = str(sc.settings.figdir)
    target = os.path.join(figdir, subplotdir)
    #the original built the existence-check path without a separator but the
    #creation path with one; os.path.join + exist_ok makes both consistent
    os.makedirs(target, exist_ok=True)
    #glob+shutil instead of os.system('mv ...'): portable and immune to shell
    #quoting issues; an explicit destination path overwrites like `mv` did
    for plot in glob.glob(os.path.join(figdir, '*'+plotpattern+'*.*')):
        shutil.move(plot, os.path.join(target, os.path.basename(plot)))

MovePlots('ccg_identification','ccg_identification')

Flag cell cycling genes and continue standard pipeline in a helper object until PCA.

In [8]:
#flag cluster 10 of the gene-space clustering (the one housing the known cyclers)
#as cell cycle genes; a set makes each membership test O(1) instead of an O(n)
#scan of a pandas Index per gene
ccgs = set(bdata.obs_names[bdata.obs['leiden']=='10'])
adata.var['is_ccg'] = [i in ccgs for i in adata.var_names]
#compute HVG and PCA on separate helper object and port the results back to the original one
bdata = adata[:, ~adata.var['is_ccg']]
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
for col in ['highly_variable','means', 'dispersions', 'dispersions_norm']:
    adata.var[col] = bdata.var[col]
#fill NaNs with False so that subsetting to HVGs is possible
#(explicit reassignment rather than .fillna(inplace=True) on the column, which
#pandas may apply to a temporary copy and is deprecated as a pattern)
adata.var['highly_variable'] = adata.var['highly_variable'].fillna(False)
bdata = bdata[:, bdata.var['highly_variable']]
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack', n_comps=50)
#cell embedding and variance info copy across directly...
adata.obsm['X_pca'] = bdata.obsm['X_pca'].copy()
adata.uns['pca'] = bdata.uns['pca'].copy()
#...while gene loadings need zero-padding for the genes absent from the helper
adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, 50))
adata.varm['PCs'][adata.var['highly_variable']] = bdata.varm['PCs']
sc.pl.pca_variance_ratio(adata, log=True, save='.pdf')
extracting highly variable genes
    finished (0:00:05)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
Trying to set attribute `.var` of view, making a copy.
/usr/local/lib/python3.6/dist-packages/scanpy/preprocessing/_simple.py:909: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
computing PCA with n_comps = 50
    on highly variable genes
    finished (0:00:15)
WARNING: saving figure to file figures-N1-endometrium-integration/pca_variance_ratio.pdf

Correct cross-individual batch effects with Harmony.

In [9]:
#number of leading PCs to hand to Harmony
num_pcs = 20

pca = adata.obsm['X_pca'][:, :num_pcs]
#correct cross-individual batch effects, so the individual is the batch variable
batch = adata.obs['individual']
In [10]:
%load_ext rpy2.ipython
In [11]:
%%R -i pca -i batch -o hem

library(harmony)
library(magrittr)

#this gets "90% of the way" to determinism. better than nothing.
set.seed(1)

#run Harmony on the pre-computed PCA coordinates (do_pca=FALSE skips its internal
#PCA), using the per-individual batch vector imported from python
hem = HarmonyMatrix(pca, batch, theta=1, verbose=FALSE, do_pca=FALSE)
#wrap as a data.frame so rpy2 exports it back to python cleanly
hem = data.frame(hem)
R[write to console]: Loading required package: Rcpp

Compute clustering and UMAP, plot metadata, clustering and individual location UMAPs.

In [12]:
#store the Harmony-corrected embedding and build the neighbour graph on it,
#so clustering and UMAP both operate on batch-corrected space
adata.obsm['X_harmony'] = hem.values
sc.pp.neighbors(adata, use_rep='X_harmony')
sc.tl.leiden(adata, resolution = 0.5)
sc.tl.umap(adata)

#need to set this up manually as the different metadata csvs are not fully compatible
plotmeta = ['individual','location','phase','clinical','day','type','treatment','sample']

sc.pl.umap(adata, color=plotmeta, save='.pdf')
sc.pl.umap(adata, color='leiden', save='_clustering.pdf')
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_clustering_clusnumbers.pdf')
computing neighbors
/usr/local/lib/python3.6/dist-packages/numba/typed_passes.py:293: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
/usr/local/lib/python3.6/dist-packages/umap/nndescent.py:92: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
/usr/local/lib/python3.6/dist-packages/numba/typed_passes.py:293: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:24)
running Leiden clustering
    finished: found 15 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:27)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:38)
... storing 'clinical' as categorical
... storing 'day' as categorical
... storing 'individual' as categorical
... storing 'location' as categorical
... storing 'phase' as categorical
... storing 'sample' as categorical
... storing 'treatment' as categorical
... storing 'type' as categorical
WARNING: saving figure to file figures-N1-endometrium-integration/umap.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap_clustering.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap_clustering_clusnumbers.pdf

Plot canonical markers for ease of annotation.

In [13]:
#read the canonical marker list, one gene symbol per line, deduplicated
with open('markers.txt','r') as fid:
    markers = np.unique([line.rstrip() for line in fid])

#make sure they're in the dataset, and sort them alphabetically for ease of finding things
markers = sorted([item for item in markers if item in adata.var_names])

#plot four genes per figure; enumerate from 1 so the saved file names stay
#1-based, matching the original float-arange/int-cast chunking exactly
for chunk, start in enumerate(range(0, len(markers), 4), start=1):
    sc.pl.umap(adata, color=markers[start:start+4], use_raw=False, save='-markers'+str(chunk)+'.pdf', color_map='OrRd')

#goodbye clutter!
MovePlots('markers','markers')
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers1.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers2.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers3.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers4.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers5.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers6.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers7.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers8.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers9.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers10.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/umap-markers11.pdf

Identify cluster markers via the rank-biserial correlation, an effect size equivalent of the Wilcoxon test. Threshold the coefficient with the literature critical value for a large effect (0.5), and possibly lower the stringency if any clusters have fewer than five markers. Write out resulting marker list to CSV.

In [18]:
#score genes per cluster with the rank-biserial correlation (effect size
#equivalent of the Wilcoxon test, per the markdown above)
rbcde.RBC(adata, use_raw=False)

plot_dict = {}
degs = pd.DataFrame(columns=['cluster','RBC'])
#start at the literature "large effect" threshold (0.5) and progressively relax;
#each cluster's marker list is frozen the first time it reaches five markers,
#so well-separated clusters keep their stringent lists
for thresh in [0.5,0.3,0.2,0.1]:
    degs_temp, temp_dict = rbcde.filter_markers(adata, use_raw=False, thresh=thresh)
    for clus in temp_dict:
        if len(temp_dict[clus])>=5 and clus not in plot_dict:
            plot_dict[clus] = temp_dict[clus]
            degs = pd.concat([degs, degs_temp.loc[degs_temp['cluster']==clus,:]])
    #stop relaxing once every reported cluster has an accepted marker list
    if set(plot_dict.keys()) == set(temp_dict.keys()):
        break
degs.to_csv(str(sc.settings.figdir)+'/cluster_markers_rbc.csv')
--> 36 markers found for cluster 0
--> 26 markers found for cluster 1
--> 101 markers found for cluster 10
--> 48 markers found for cluster 11
--> 479 markers found for cluster 12
--> 28 markers found for cluster 13
--> 140 markers found for cluster 14
--> 99 markers found for cluster 2
--> 20 markers found for cluster 3
--> 68 markers found for cluster 4
--> 97 markers found for cluster 5
--> 9 markers found for cluster 6
--> 33 markers found for cluster 7
--> 27 markers found for cluster 8
--> 122 markers found for cluster 9

Do dotplots (for top 100 in the interest of legibility, if need be) and more detailed exports.

In [19]:
#undo the log transform to get normalized count-scale values for fold changes
adata_count = AnnData(np.expm1(adata.X), var=adata.var, obs=adata.obs)
for clus in plot_dict:
    degs_clus = degs.loc[degs['cluster']==clus,:].copy()
    degs_clus['log2_FC'] = 0
    degs_clus['percent_cluster'] = 0
    degs_clus['percent_rest'] = 0
    #split the cells into this cluster vs everything else, then subset to markers
    adata_clus = adata_count[adata.obs['leiden']==clus]
    adata_rest = adata_count[[not i for i in adata.obs['leiden']==clus]]
    adata_clus = adata_clus[:, degs_clus.index]
    adata_rest = adata_rest[:, degs_clus.index]
    degs_clus['log2_FC'] = np.asarray(np.log2(np.mean(adata_clus.X,axis=0)/np.mean(adata_rest.X,axis=0))).reshape(-1)
    #are you expressed?
    #NOTE(review): this rewrites .data of a view's X to booleans - it works with the
    #anndata version logged above, but verify it does not leak into adata_count
    #(or break) on upgrade
    adata_clus.X.data = adata_clus.X.data > 0
    adata_rest.X.data = adata_rest.X.data > 0
    degs_clus['percent_cluster'] = np.asarray(100*np.sum(adata_clus.X,axis=0)/adata_clus.shape[0]).reshape(-1)
    degs_clus['percent_rest'] = np.asarray(100*np.sum(adata_rest.X,axis=0)/adata_rest.shape[0]).reshape(-1)
    degs_clus.to_csv(str(sc.settings.figdir)+'/cluster_markers_'+clus+'.csv')
    #cap at the top 100 genes so the dotplot stays legible
    sc.pl.dotplot(adata, {clus: plot_dict[clus][:100]}, groupby='leiden', use_raw=False, save='_cluster_markers_'+clus+'.pdf')

#goodbye clutter!
MovePlots('cluster_markers','cluster_markers')
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_0.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_1.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_10.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_11.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_12.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_13.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_14.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_2.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_3.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_4.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_5.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_6.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_7.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_8.pdf
WARNING: saving figure to file figures-N1-endometrium-integration/dotplot_cluster_markers_9.pdf

And with that, I think we're good. Save the object for later use.

In [20]:
adata.write('endometrium-N1-integration.h5ad')